Parkinson’s disease (PD) is a neurodegenerative disorder of the central nervous system that causes partial or full loss of motor reflexes, speech, behavior, mental processing, and other vital functions. PD was first described as the “shaking palsy” by Dr. James Parkinson in 1817. It is generally observed in elderly people and impairs the speech and motor abilities (writing, balance, etc.) of about 90% of patients. After Alzheimer’s disease, PD is the second most common neurological disorder of old age: an estimated 10 million people worldwide, and approximately 100 000 in Turkey, suffer from the disease, and it affects roughly one in every hundred people over the age of 65. There is currently no known cure. Although a substantial number of drug therapies can reduce the difficulties caused by the disorder, PD is usually diagnosed and treated using invasive methods, which complicates diagnosis and treatment for patients living with the disease.

Reference: B. Erdogdu Sakar, M. E. Isenkul, C. O. Sakar, A. Sertbas, F. Gurgen, S. Delil, H. Apaydin, and O. Kursun, “Collection and Analysis of a Parkinson Speech Dataset With Multiple Types of Sound Recordings,” IEEE Journal of Biomedical and Health Informatics, vol. 17, no. 4, July 2013.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from pandas_profiling import ProfileReport
import ipywidgets as widgets
import itertools
from scipy.stats import shapiro
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
import re
import random
from tqdm import tqdm
from IPython.display import clear_output
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
%matplotlib inline
plt.rcParams["figure.figsize"] = (15,10)
pd.set_option('display.max_colwidth', 500)
df = pd.read_csv("Data - Parkinsons")
df.head()
df.shape
df.info(verbose=True)
df.describe().T
# Describe numeric variables against the target variable 'status'
columns_to_show = [cols for cols in df.columns if cols not in ('status', 'name')]
[display(f"{inds} : {cols}",df.groupby(['status'])[cols].agg([np.mean,min,max,np.std]).T) for inds,cols in enumerate(columns_to_show)];
if 'status' not in columns_to_show:
columns_to_show.append('status')
else:
pass
pd.DataFrame(data={'NANs':list(df[columns_to_show].isna().sum()),
'Zeros':[df[df[i] == 0 ][i].count() for i in columns_to_show],
'Negatives':[df[df[j] < 0 ][j].count() for j in columns_to_show]},
index=columns_to_show)
## Function to create the list of continuous and categorical variables.
def create_variable_list(df,min_cats,stats=False):
'''
    Function for categorising columns into continuous and categorical variables.
Usage:
create_variable_list(df,min_cats,stats):
(1) df : Pandas data frame.
    (2) min_cats : Threshold on the number of unique values for a variable to be deemed categorical.
    (3) stats : (default=False) Also return a table of variable name, variable type, number of unique values and list of unique values.
Return : Returns two lists each with categorical and continuous variables respectively.
Also if stats=True, then return a table of data types
'''
categorical =[]
continuous = []
objects = []
var_df = pd.DataFrame(columns=['Variable',
'Type',
'Categorical_Class',
'Uniques',
'N-Uniques'])
for col in df.columns:
if (df[col].dtype.name == 'int64' or df[col].dtype.name == 'float64'):
if df[col].nunique() > min_cats :
continuous.append(col)
else:
categorical.append(col)
elif (df[col].dtype.name == 'category'):
categorical.append(col)
else:
objects.append(col)
if stats == True :
for cats in categorical:
if df[cats].nunique() == 2 :
cat_class = 'Binary'
else:
cat_class = 'Multi'
var_df = var_df.append({'Variable' : cats,
'Type' :'Categorical',
'Categorical_Class':cat_class,
'Uniques': df[cats].unique(),
'N-Uniques': len(df[cats].unique())},
ignore_index=True)
for conts in continuous:
var_df = var_df.append({'Variable' : conts,
'Type' :'Continuous',
#'Uniques': df[conts].unique(),
'N-Uniques': len(df[conts].unique())},
ignore_index=True)
for obs in objects:
var_df = var_df.append({'Variable' : obs,
'Type' :'Objects',
#'Uniques': df[conts].unique(),
'N-Uniques': len(df[obs].unique())},
ignore_index=True)
return categorical,continuous,var_df
else:
return categorical,continuous
_,_,cats_df1= create_variable_list(df,10,stats=True)
cats_df1
# Heatmap of correlations between all variables.
fig,ax = plt.subplots(figsize=(20,10))
sns.heatmap(df.corr(),annot=True);
plt.title("\nHeatmap of correlations between all features\n",{'fontsize':20});
if 'status' in columns_to_show:
columns_to_show.remove('status')
else:
pass
correlations = df[columns_to_show].corrwith(df['status'],method='pearson').to_frame()
sorted_correlations = correlations.sort_values(0,ascending=False)
fig, ax = plt.subplots(figsize=(10,15))
sns.heatmap(sorted_correlations, cmap='coolwarm', annot=True, vmin=-1, vmax=1, ax=ax);
plt.title("\nCorrelation of Status with other predictors\n",fontsize=20);
# Statistical significance of the correlation of predictors with the target variable 'status'
alpha = 0.05
print("Statistical significance of correlation between predictors and Personal Loan\n=========================================================")
print(f"Level of Significance(alpha) :{alpha}\n")
cols_to_drop = ['status','name']
columns_to_test = [cols for cols in df.columns if cols not in cols_to_drop]
# print(columns_to_test)
p_col,coef_col,feature=[],[],[]
for cols in columns_to_test:
pearson_coef, p_value=stats.pearsonr(df[cols], df['status'])
if p_value < alpha :
feature.append(cols)
p_col.append(p_value)
coef_col.append(pearson_coef)
pearson_corr_df = pd.DataFrame({'Features':feature,
'Pearson_Coefficients':coef_col,
'P_Value':p_col}).sort_values(by='Pearson_Coefficients',ascending=False)
pearson_corr_df.set_index('Features',inplace=True)
pearson_corr_df
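Because 'status' is binary, the Pearson correlation of each feature with it is equivalent to the point-biserial correlation. The short cross-check below is a sketch only, not part of the original notebook; it uses scipy's stats.pointbiserialr and should reproduce the coefficients and p-values in the table above.
# Sketch: point-biserial cross-check of the Pearson correlations with the binary target.
pb_rows = []
for cols in columns_to_test:
    r_pb, p_pb = stats.pointbiserialr(df['status'], df[cols])
    pb_rows.append({'Features': cols, 'PointBiserial_r': r_pb, 'P_Value': p_pb})
pd.DataFrame(pb_rows).set_index('Features').sort_values(by='PointBiserial_r', ascending=False)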
## Determine Correlation coefficient for each pair of predictors and their statistical significance
pd.set_option('display.max_rows', 500)
alpha = 0.05
# corr_thresh = -1
xy_pairs = [_ for _ in itertools.combinations(columns_to_show,2)]
corr_df = pd.DataFrame(columns=['Variable 1','Variable 2','Corr_Coeff','P_Value'])
for pairs in xy_pairs:
corr_0 = df[pairs[0]].corr(df[pairs[1]])
pearson_coef, p_value=stats.pearsonr(df[pairs[0]], df[pairs[1]])
if p_value < alpha:
corr_df = corr_df.append({'Variable 1':pairs[0],'Variable 2':pairs[1],'Corr_Coeff':corr_0,'P_Value':p_value},ignore_index=True)
print(f"Minimum Correlation Coefficient: {corr_df['Corr_Coeff'].min()} \nMaximum Correlation Coefficient: {corr_df['Corr_Coeff'].max()}")
display(corr_df)
def get_VIF_Table(df):
'''
Function to get the Variance Inflation Factor from the data frame:
Usage: get_VIF_Table(pd.DataFrame)
(1) df : Dataframe
Return:
Returns the Table containing the list of column names or features and VIF for each of them.
'''
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_data = pd.DataFrame()
vif_data["feature"] = df.columns
vif_data["VIF"] =[variance_inflation_factor(df.values, i) for i in range(len(df.columns))]
vif_data['VIF'] = vif_data['VIF'].map(lambda a: '%2.2f' % a)
return vif_data
vif_df = get_VIF_Table(df[columns_to_show])
vif_df['VIF'] = vif_df['VIF'].astype('float64')
vif_df.sort_values(by='VIF',inplace=True,ascending=False)
vif_df
new_cols = vif_df[vif_df['VIF'] < 100]['feature'].values
vif_df = get_VIF_Table(df[new_cols])
vif_df['VIF'] = vif_df['VIF'].astype('float64')
vif_df.sort_values(by='VIF',inplace=True,ascending=False)
vif_df
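The cut at VIF &lt; 100 above is a single manual pass. As an illustration only, the sketch below prunes iteratively: drop the feature with the highest VIF, recompute, and repeat until every remaining feature is below a threshold. The threshold of 10 is a common rule of thumb and is an assumption here, not something the original analysis specifies.
# Sketch of iterative VIF pruning (assumed threshold = 10, a common rule of thumb).
def prune_by_vif(frame, threshold=10.0):
    cols = list(frame.columns)
    while len(cols) > 2:
        vifs = get_VIF_Table(frame[cols])
        vifs['VIF'] = vifs['VIF'].astype('float64')
        worst = vifs.sort_values(by='VIF', ascending=False).iloc[0]
        if worst['VIF'] < threshold:
            break
        cols.remove(worst['feature'])
    return cols
# Example usage (commented out so the tables above are unchanged):
# low_vif_cols = prune_by_vif(df[columns_to_show])
# get_VIF_Table(df[low_vif_cols])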
# sns.set(font_scale=1.2)
plots = [_ for _ in itertools.combinations(columns_to_show,2)]
num_cols=2
for i in tqdm(range(0,len(plots),2)):
fig, ax = plt.subplots(1,num_cols,figsize=(20,4))
sns.regplot(plots[i][0],
plots[i][1],
data=df,
fit_reg=1,
line_kws={'color': 'red'},
ax=ax[0]);
ax[0].set_title(f"\n({i+1}) {plots[i][0]} / {plots[i][1]} \n ρ = {round(df[plots[i][0]].corr(df[plots[i][1]]),2)}\n",fontdict=dict(fontsize=20))
if (len(plots)-i) >=2 :
sns.regplot(plots[i+1][0],
plots[i+1][1],
data=df,
fit_reg=1,
line_kws={'color': 'red'},
ax=ax[1]);
ax[1].set_title(f"\n({i+2}) {plots[i+1][0]} / {plots[i+1][1]} \n ρ = {round(df[plots[i+1][0]].corr(df[plots[i+1][1]]),2)}\n",fontdict=dict(fontsize=20))
clear_output(wait=True)
fig.show()
#re-running the categorisation function
cat_var_list,cont_var_list,cats_df1= create_variable_list(df,10,stats=True)
cats_df1
def get_Normality_Check(df):
'''
    This function uses the Shapiro-Wilk test to check normality at an alpha of 0.05.
Ho: The sample data is Normally distributed.
H1: The sample data is not Normally distributed.
Usage:
    1) df : Data frame with all features whose normality needs to be tested.
    Returns:
    1) DataFrame with Features, Normality (Yes/No) and corresponding P_Value as columns.
'''
alpha = 0.05
normal,p_value,cols=[],[],[]
for i in df.columns:
if (df[i].dtype.name == 'int64' or df[i].dtype.name == 'float64'):
data = df[i]
if shapiro(data)[1] > alpha :
normal.append('Yes')
else:
normal.append('No')
p_value.append(round(shapiro(data)[1],8))
cols.append(i)
else:
pass
normality_df = pd.DataFrame({'Features':cols,'Normality':normal,'P-Value':p_value})
normality_df.set_index('Features',inplace=True)
return(normality_df)
#Testing Only
#=============
# get_Normality_Check(df)
sncp_c1 = sns.color_palette("muted", 40)
if 'name' in cont_var_list:
cont_var_list.remove('name')
else:
pass
num_cols=2
for color,index in enumerate(tqdm(range(0,len(cont_var_list)))):
fig, ax = plt.subplots(1,num_cols,figsize=(20,4))
sns.boxplot(df[cont_var_list[index]],color=sncp_c1[color],ax=ax[0]);
    ax[0].set_title(f'({index+1}) Boxplot of {cont_var_list[index]}\n',{'fontsize':15})
sns.distplot(df[cont_var_list[index]],kde=True,color=sncp_c1[color],ax=ax[1]);
ax[1].set_title(f'({index+1}) Histogram of {cont_var_list[index]}\n',{'fontsize':15})
clear_output(wait=True)
plt.show()
print("\nBin'd data are not displayed in this chart\n")
get_Normality_Check(df)
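Most of these acoustic measures are strongly skewed, so many features can be expected to fail the Shapiro-Wilk test. As an illustrative aside (an assumption, not part of the original workflow), the same helper can be re-run on a log transform of the strictly positive continuous features:
# Illustrative aside: re-test normality after log-transforming the strictly positive features.
pos_cols = [c for c in cont_var_list if (df[c] > 0).all()]
get_Normality_Check(np.log(df[pos_cols]))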
# Convert numeric variables into categorical variables
for cont_var in cont_var_list:
df[cont_var+'-Bin'] = pd.cut(df[cont_var],bins=10,precision=0,right=True)
df.head()
#re-running the categorisation function
cat_var_list,cont_var_list,cats_df1= create_variable_list(df,10,stats=True)
cats_df1
# Remove the variable 'status' from the list of categorical variables if present
if 'status' in cat_var_list:
cat_var_list.remove('status')
cat_var_list
sncp_c2 = sns.color_palette("bright", 200)
num_cols=2
idx = 0
for color,index in enumerate(tqdm(range(0,len(cat_var_list),2))):
fig, ax = plt.subplots(1,num_cols,figsize=(20,4))
sns.countplot(df[cat_var_list[index]],hue=df['status'],color=sncp_c2[random.randint(0,100)],ax=ax[0])
title_txt = re.sub(r'-Bin',"",cat_var_list[index])
ax[0].set_title(f"({index+1}) Distribution of {cat_var_list[index]} with and without Parkinson's Disease\n",{'fontsize':15})
ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation = 45)
    if (len(cat_var_list) - index) >= 2 :
sns.countplot(df[cat_var_list[index+1]],hue=df['status'],color=sncp_c2[random.randint(0,100)],ax=ax[1]);
ax[1].set_title(f"({index+2}) Distribution of {cat_var_list[index+1]} with and without Parkinson's Disease\n",{'fontsize':15})
ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 45)
idx = idx +1
clear_output(wait=True)
plt.show()
try:
cats_df1.set_index('Variable',inplace=True)
except:
pass
cats_df1
df.head()
### Plotly Version of the Pie Chart
if 'status' not in cat_var_list:
cat_var_list.append('status')
# cat_var_list
for cat_vars in cat_var_list:
data = df[cat_vars].value_counts(normalize=True)
labels = data.index.values.astype('str')
count = df[cat_vars].value_counts()
fig = go.Figure(data=[go.Pie(labels=labels,
values=count,
textinfo='percent+value',
insidetextorientation='radial',
rotation=-40,
title=f'<b>{cat_vars}</b>',
hole=0.3
)])
fig.update_layout(
autosize=True,
title=dict(
text=f'<b>Pie Chart for {cat_vars}</b>',
x=0.5,
y=0.9,
font=dict(
family="Arial",
size=15,
color='#000000'
)
)
)
fig.show()
### Principal Component Analysis
from pca import pca
from sklearn.preprocessing import StandardScaler,MinMaxScaler
X,y = df[columns_to_show],df['status']
s_X = MinMaxScaler().fit_transform(X)
pca_model = pca(normalize=False,)
model = pca_model.fit_transform(s_X,col_labels=columns_to_show,verbose=0)
n_pca = model['loadings'].shape[0]
feature_index_pca = [np.abs(model['loadings'].iloc[i]).argmax() for i in range(n_pca)]
pca_f_name = [columns_to_show[feature_index_pca[i]] for i in range(n_pca)]
p_comp_lst = [f"PC{i+1}" for i in range(n_pca)]
pca_ex_var = model['explained_var']
pca_df = pd.DataFrame({'Principal Components':p_comp_lst,
'Features':pca_f_name,
'Cumulative Explained Variance':pca_ex_var
})
# pca_df['Cumulative Explained Variance %'] = pca_df['Cumulative Explained Variance %'].map(lambda x : round(x*100,4) )
pca_df['Absolute Explained Variance'] = pca_df['Cumulative Explained Variance'].diff()
pca_df['Absolute Explained Variance'].iloc[0] = pca_df['Cumulative Explained Variance'].iloc[0]
display(pca_df)
pca_model.plot();
sns.barplot(y=pca_df['Features'],x=pca_df['Absolute Explained Variance'],orient='h',estimator=max);
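The table above reports cumulative explained variance per component. As a sketch only (the 95% cut-off is an assumption; the original analysis does not fix one), scikit-learn's PCA can be asked directly how many components are needed to reach that fraction of variance:
# Sketch (assumed 95% cumulative-variance target, not specified in the original analysis).
from sklearn.decomposition import PCA as SkPCA
sk_pca = SkPCA(n_components=0.95).fit(s_X)
print(f"Components needed for 95% of the variance: {sk_pca.n_components_}")
print(f"Cumulative explained variance reached: {sk_pca.explained_variance_ratio_.cumsum()[-1]:.4f}")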
# pca_model.biplot(n_feat=10, legend=False);
# pca_model.scatter(legend=True, SPE=True, hotellingt2=True);
# pca_model.scatter3d(legend=True, SPE=True, hotellingt2=True);
# pca_model.biplot3d(n_feat=10, legend=False)
# pca_model.biplot(legend=True, SPE=True, hotellingt2=True)
# pca_model.biplot3d(legend=True, SPE=True, hotellingt2=True)